Reading and Preparing Data

library(tidyverse)
library(scales)
library(viridis)
library(stats)
library(corrplot)
library(car)
library(cluster)
library(plotly)

# Load TB dataset
tb_data <- read.csv("TB_Burden_Country.csv", stringsAsFactors = FALSE)

# Filter for latest available year in dataset
latest_year <- max(tb_data$Year, na.rm = TRUE)
tb_latest <- tb_data %>% filter(Year == latest_year)

# Select and clean relevant columns
tb_clean <- tb_latest %>%
  select(
    country = `Country.or.territory.name`,
    population = `Estimated.total.population.number`,
    prevalence_per_100k = `Estimated.prevalence.of.TB..all.forms..per.100.000.population`,
    mortality_per_100k = `Estimated.mortality.of.TB.cases..all.forms..excluding.HIV..per.100.000.population`,
    incidence_per_100k = `Estimated.incidence..all.forms..per.100.000.population`,
    hiv_percent = `Estimated.HIV.in.incident.TB..percent.`,
    case_detection_rate = `Case.detection.rate..all.forms...percent`
  ) %>%
  drop_na()

# Rename long country name
tb_clean$country[tb_clean$country == "Democratic People's Republic of Korea"] <- "Korea"

Correlation Analysis

numeric_vars <- tb_clean %>% 
  select(prevalence_per_100k, mortality_per_100k, incidence_per_100k, hiv_percent, case_detection_rate)

correlation_matrix <- cor(numeric_vars, use = "complete.obs")
corrplot(correlation_matrix, method = "color", 
         type = "upper", order = "hclust",
         addCoef.col = "black", number.cex = 0.7)

Regression Analysis

mortality_model <- lm(mortality_per_100k ~ incidence_per_100k, data = tb_clean)
model_summary <- summary(mortality_model)
conf_intervals <- confint(mortality_model)

regression_plot <- ggplot(tb_clean, aes(x = incidence_per_100k, y = mortality_per_100k)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, color = "darkred") +
  theme_minimal() +
  labs(title = "TB Mortality vs. Incidence with Confidence Interval",
       subtitle = paste("R² =", round(model_summary$r.squared, 3)),
       x = "Incidence per 100k", y = "Mortality per 100k")

regression_plot

model_summary
## 
## Call:
## lm(formula = mortality_per_100k ~ incidence_per_100k, data = tb_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.892  -4.555  -3.387   1.407  60.303 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        3.175168   1.243672   2.553   0.0117 *  
## incidence_per_100k 0.090302   0.005303  17.029   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.45 on 151 degrees of freedom
## Multiple R-squared:  0.6576, Adjusted R-squared:  0.6553 
## F-statistic:   290 on 1 and 151 DF,  p-value: < 2.2e-16
conf_intervals
##                         2.5 %    97.5 %
## (Intercept)        0.71792201 5.6324141
## incidence_per_100k 0.07982503 0.1007794

K-Means Clustering

scaled_data <- scale(numeric_vars)
kmeans_result <- kmeans(scaled_data, centers = 3, nstart = 25)

tb_clustered <- tb_clean %>%
  mutate(cluster = as.factor(kmeans_result$cluster))

cluster_stats <- tb_clustered %>%
  group_by(cluster) %>%
  summarize(
    count = n(),
    mean_incidence = mean(incidence_per_100k),
    mean_mortality = mean(mortality_per_100k),
    mean_hiv_percent = mean(hiv_percent),
    mean_detection = mean(case_detection_rate)
  )

cluster_stats
## # A tibble: 3 × 6
##   cluster count mean_incidence mean_mortality mean_hiv_percent mean_detection
##   <fct>   <int>          <dbl>          <dbl>            <dbl>          <dbl>
## 1 1         110           54.2           4.68             8.67           79.7
## 2 2          32          250.           39.2             11.1            58.7
## 3 3          11          646.           56.2             55.3            53.8

Top 10 TB Incidence Rates

tb_top_10 <- tb_clean %>% arrange(desc(incidence_per_100k)) %>% slice(1:10)  
ggplot(tb_top_10, aes(x = reorder(country, -incidence_per_100k), y = incidence_per_100k)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Top 10 TB Incidence Rate by Country", x = "Country", y = "Incidence Rate per 100,000")

Interactive Cluster Plot

plotly_cluster <- ggplot(tb_clustered, aes(x = incidence_per_100k, y = mortality_per_100k, color = cluster, text = country)) +
  geom_point(size = 3) +
  theme_minimal()

ggplotly(plotly_cluster, tooltip = "text")